# ------------------------------------------------------------------------------
# NOTE:
# This script fits the Wordfish models to estimate parties' committee-specific
# positions. However, due to changes in the quanteda and quanteda.textmodels
# packages over time, this script may not run successfully with current package
# versions. As explained in the README, the Wordfish estimation was originally
# conducted using versions of quanteda and related packages from around
# April 2021.
# ------------------------------------------------------------------------------

require(stringr)
require(dplyr)
require(tidyr)
require(quanteda)
require(quanteda.textmodels)

# Wordfish estimation
# For every year from 1959 to 2019:
#   1. load that year's document-feature matrix (dfm) of speeches,
#   2. collapse it to one document per date/house/meeting/speaker/party,
#   3. for each committee (nameOfMeeting), aggregate counts to the party-house
#      level and fit a Wordfish model on the aggregated counts,
#   4. save the fitted models and a party x committee matrix of positions
#      (theta); parties without usable text in a committee stay NA.
year <- 1959
# Make sure the output folder exists before any (slow) estimation starts;
# this is a no-op when the folder is already there.
dir.create("Wordfish", showWarnings = FALSE)
while (year <= 2019) {
  # load dfm
  dfm <- readRDS(paste0("dfm/dfm_", year, ".rds"))
  # recode the chamber name: "HOR" for 衆議院, "HOC" for everything else
  dfm@docvars$nameOfHouse <- ifelse(dfm@docvars$nameOfHouse == "衆議院", "HOR", "HOC")
  
  # create document ID
  dfm@docvars$ID <- paste(dfm@docvars$date, 
                          dfm@docvars$nameOfHouse, 
                          dfm@docvars$nameOfMeeting, 
                          dfm@docvars$speaker, 
                          dfm@docvars$party_jp)
  dfm <- dfm_group(dfm, groups = "ID")
  
  # create party label (party and chamber combined, e.g. "LDP_HOR")
  dfm@docvars$party <- paste(dfm@docvars$party_en, dfm@docvars$nameOfHouse, sep = "_")
  dfm <- dfm_subset(dfm, !str_detect(party_en, pattern = "Independent"))
  dfm <- dfm_subset(dfm, !str_detect(party_en, pattern = "Minor Party"))
  
  # prepare data objects
  party.names <- unique(dfm@docvars$party)
  party.names <- party.names[order(party.names)]
  party <- length(party.names)
  committee.names <- unique(dfm@docvars$nameOfMeeting)
  committee.names <- committee.names[order(committee.names)]
  committee <- length(committee.names)
  
  # container for committee-specific party positions (parties x committees)
  committee.specific.positions <- matrix(NA, party, committee)
  rownames(committee.specific.positions) <- party.names
  colnames(committee.specific.positions) <- committee.names
  
  # fit wordfish models (one list slot per committee, in committee.names order)
  wordfish.result <- vector("list", committee)
  for (j in seq_len(committee)) {
    ## dfm for committee j
    committee.dfm <- dfm[dfm@docvars$nameOfMeeting == committee.names[j],]
    ## aggregate the dfm to the party level
    dfm.matrix <- matrix(NA, party, ncol(dfm))
    rownames(dfm.matrix) <- party.names
    colnames(dfm.matrix) <- colnames(dfm)
    for (k in seq_len(party)) {
      dfm.matrix[k, ] <- round(colSums(committee.dfm[committee.dfm@docvars$party == party.names[k],]))
    }
    ## drop terms used by fewer than two parties
    ## (a logical mask is used instead of `-which(...)`: a negative index built
    ## from an empty `which()` would select zero columns, i.e. drop everything)
    used.by.two <- colSums(dfm.matrix > 0) >= 2
    dfm.matrix <- dfm.matrix[, used.by.two, drop = FALSE]
    ## drop parties left without any term; drop = FALSE keeps a matrix even
    ## when only one party remains
    dfm.matrix <- dfm.matrix[rowSums(dfm.matrix) > 0, , drop = FALSE]
    
    ## choose the two documents that fix the direction of the Wordfish scale:
    ## the first (JCP, or JSP when JCP is absent) is constrained to lie below
    ## the second (LDP). House of Representatives labels are preferred; the
    ## House of Councillors labels are the fallback.
    doc.names <- rownames(dfm.matrix)
    dir.ldp.hor <- grep("LDP_HOR", doc.names)
    dir.jcp.hor <- grep("JCP_HOR", doc.names)
    dir.jsp.hor <- grep("JSP_HOR", doc.names)
    dir.ldp.hoc <- grep("LDP_HOC", doc.names)
    dir.jcp.hoc <- grep("JCP_HOC", doc.names)
    dir.jsp.hoc <- grep("JSP_HOC", doc.names)
    hor.anchors.available <- length(dir.ldp.hor) != 0 &&
      (length(dir.jsp.hor) != 0 || length(dir.jcp.hor) != 0)
    if (hor.anchors.available) {
      dir.left <- if (length(dir.jcp.hor) == 0) dir.jsp.hor else dir.jcp.hor
      anchors <- c(dir.left, dir.ldp.hor)
    } else {
      dir.left <- if (length(dir.jcp.hoc) == 0) dir.jsp.hoc else dir.jcp.hoc
      anchors <- c(dir.left, dir.ldp.hoc)
    }
    ## Wordfish needs exactly two anchor documents; fail with a message that
    ## names the offending committee instead of an opaque downstream error
    if (length(anchors) != 2) {
      stop("Cannot identify the Wordfish direction for committee '",
           committee.names[j], "' in ", year,
           ": expected one JCP/JSP document and one LDP document, found ",
           length(anchors), " anchor document(s).",
           call. = FALSE)
    }
    
    ## convert the matrix to a dfm: each party becomes one pseudo document in
    ## which every term is repeated as often as it was counted. The string is
    ## assembled in one pass per party and is identical to appending the terms
    ## one by one with paste() (leading blank, blank-separated pieces, empty
    ## pieces for zero counts).
    term.names <- colnames(dfm.matrix)
    pseudo.text <- vapply(
      seq_len(nrow(dfm.matrix)),
      function(k) {
        pieces <- vapply(
          seq_along(term.names),
          function(l) paste(rep(term.names[l], dfm.matrix[k, l]), collapse = " "),
          character(1)
        )
        paste(c("", pieces), collapse = " ")
      },
      character(1)
    )
    wordfish.dfm <- dfm(tokens(pseudo.text, what = "fastestword"))
    ## keep terms whose document frequency exceeds 1% of the documents
    ## (rounded); with fewer than 50 documents the threshold rounds to 0 and
    ## every observed term is kept
    frequently.used.terms <- colSums(wordfish.dfm > 0) > round(nrow(wordfish.dfm) / 100)
    wordfish.dfm <- dfm_keep(
      wordfish.dfm, 
      featnames(wordfish.dfm)[frequently.used.terms], 
      valuetype = "fixed"
    )
    ## fit the Wordfish model (seed reset before every fit, as in the
    ## original estimation)
    set.seed(12345)
    wordfish.result[[j]] <- textmodel_wordfish(wordfish.dfm, dir = anchors)
    
    # store committee-specific party positions
    party.ID <- match(rownames(dfm.matrix), party.names)
    committee.specific.positions[party.ID, j] <- wordfish.result[[j]]$theta
  }
  
  # save outputs
  saveRDS(wordfish.result, file = paste0("Wordfish/Wordfish_results_", year, ".rds"))
  saveRDS(committee.specific.positions, file = paste0("Wordfish/position_", year, ".rds"))
  print(paste0("completed processing for ", year, " data"))
  year <- year + 1
}
